On importe les bibliothèques nécessaires pour le projet.
# Libraries for the project: scientific stack, plotting, ML, and word embeddings.
# (Duplicate `StandardScaler` import removed; imports regrouped by purpose.)
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import pycountry_convert as pc
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
On commence par observer les données.
# Load the cast/crew table.
# NOTE(review): the variable is named `titles` but it reads credits.csv
# (columns: person_id, id, name, character, role). The names are swapped
# relative to the files, and the whole notebook is consistent with this
# swap — do not "fix" one side without the other.
titles = pd.read_csv("credits.csv")
titles.head()
| person_id | id | name | character | role | |
|---|---|---|---|---|---|
| 0 | 3748 | tm84618 | Robert De Niro | Travis Bickle | ACTOR |
| 1 | 14658 | tm84618 | Jodie Foster | Iris Steensma | ACTOR |
| 2 | 7064 | tm84618 | Albert Brooks | Tom | ACTOR |
| 3 | 3739 | tm84618 | Harvey Keitel | Matthew 'Sport' Higgins | ACTOR |
| 4 | 48933 | tm84618 | Cybill Shepherd | Betsy | ACTOR |
# Load the titles table (movies/shows metadata).
# NOTE(review): named `credits` but reads titles.csv — the swap mirrors
# the cell above and is relied on by the rest of the notebook.
credits = pd.read_csv("titles.csv")
credits
| id | title | type | description | release_year | age_certification | runtime | genres | production_countries | seasons | imdb_id | imdb_score | imdb_votes | tmdb_popularity | tmdb_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ts300399 | Five Came Back: The Reference Films | SHOW | This collection includes 12 World War II-era p... | 1945 | TV-MA | 51 | ['documentation'] | ['US'] | 1.0 | NaN | NaN | NaN | 0.600 | NaN |
| 1 | tm84618 | Taxi Driver | MOVIE | A mentally unstable Vietnam War veteran works ... | 1976 | R | 114 | ['drama', 'crime'] | ['US'] | NaN | tt0075314 | 8.2 | 808582.0 | 40.965 | 8.179 |
| 2 | tm154986 | Deliverance | MOVIE | Intent on seeing the Cahulawassee River before... | 1972 | R | 109 | ['drama', 'action', 'thriller', 'european'] | ['US'] | NaN | tt0068473 | 7.7 | 107673.0 | 10.010 | 7.300 |
| 3 | tm127384 | Monty Python and the Holy Grail | MOVIE | King Arthur, accompanied by his squire, recrui... | 1975 | PG | 91 | ['fantasy', 'action', 'comedy'] | ['GB'] | NaN | tt0071853 | 8.2 | 534486.0 | 15.461 | 7.811 |
| 4 | tm120801 | The Dirty Dozen | MOVIE | 12 American military prisoners in World War II... | 1967 | NaN | 150 | ['war', 'action'] | ['GB', 'US'] | NaN | tt0061578 | 7.7 | 72662.0 | 20.398 | 7.600 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5845 | tm1014599 | Fine Wine | MOVIE | A beautiful love story that can happen between... | 2021 | NaN | 100 | ['romance', 'drama'] | ['NG'] | NaN | tt13857480 | 6.8 | 45.0 | 1.466 | NaN |
| 5846 | tm898842 | C/O Kaadhal | MOVIE | A heart warming film that explores the concept... | 2021 | NaN | 134 | ['drama'] | [] | NaN | tt11803618 | 7.7 | 348.0 | NaN | NaN |
| 5847 | tm1059008 | Lokillo | MOVIE | A controversial TV host and comedian who has b... | 2021 | NaN | 90 | ['comedy'] | ['CO'] | NaN | tt14585902 | 3.8 | 68.0 | 26.005 | 6.300 |
| 5848 | tm1035612 | Dad Stop Embarrassing Me - The Afterparty | MOVIE | Jamie Foxx, David Alan Grier and more from the... | 2021 | PG-13 | 37 | [] | ['US'] | NaN | NaN | NaN | NaN | 1.296 | 10.000 |
| 5849 | ts271048 | Mighty Little Bheem: Kite Festival | SHOW | With winter behind them, Bheem and his townspe... | 2021 | NaN | 7 | ['family', 'animation', 'comedy'] | [] | 1.0 | tt13711094 | 7.8 | 18.0 | 2.289 | 10.000 |
5850 rows × 15 columns
Sur Kaggle, on récupère les descriptions des colonnes.
Nous allons d'abord étudier les informations générales sur notre dataframe.
# Structural overview: row/column counts and column names of both frames
# (cell outputs reproduced inline below each call).
titles.shape
(77801, 5)
titles.columns.values
array(['person_id', 'id', 'name', 'character', 'role'], dtype=object)
credits.shape
(5850, 15)
credits.columns.values
array(['id', 'title', 'type', 'description', 'release_year',
'age_certification', 'runtime', 'genres', 'production_countries',
'seasons', 'imdb_id', 'imdb_score', 'imdb_votes',
'tmdb_popularity', 'tmdb_score'], dtype=object)
Ensuite, nous allons traiter et analyser les objets nan. Nous calculons les pourcentages de nan par colonne dans chaque dataframe.
# Percentage of missing values per column, descending. Only `character`
# has NaNs in the cast table (~12.6%).
(titles.isna().sum(axis=0)/titles.shape[0]*100).round(2).sort_values(ascending=False)
character 12.56 person_id 0.00 id 0.00 name 0.00 role 0.00 dtype: float64
(credits.isna().sum(axis=0)/credits.shape[0]*100).round(2).sort_values(ascending=False)
seasons 64.00 age_certification 44.77 imdb_votes 8.51 imdb_score 8.24 imdb_id 6.89 tmdb_score 5.32 tmdb_popularity 1.56 description 0.31 title 0.02 id 0.00 type 0.00 release_year 0.00 runtime 0.00 genres 0.00 production_countries 0.00 dtype: float64
# Count of rows whose character is missing (9772 rows).
titles[titles["character"].isna()].shape
(9772, 5)
Pour corriger les 12,56% de noms de personnages qui ne sont pas mentionnés dans l'ensemble des données, nous les remplacerons par "Non mentionné".
# Fill the ~12.6% missing character names with an explicit placeholder.
titles.fillna({"character": "Not mentioned"}, inplace=True)
Maintenant, nous pouvons supprimer les doublons dans ce dataframe, s'il y en a.
# Remove duplicate rows from the cast table, if any.
titles = titles.drop_duplicates()
Nous vérifions que les titres sont nettoyés des valeurs nan. Nous vérifions également ses informations générales
# Sanity check: shape unchanged (no duplicates were found) and no NaN
# remains in any column of `titles`.
titles.shape
(77801, 5)
(titles.isna().sum(axis=0)/titles.shape[0]*100).round(2).sort_values(ascending=False)
person_id 0.0 id 0.0 name 0.0 character 0.0 role 0.0 dtype: float64
Nous remarquons que les saisons ont 64% d'objets nan. C'est tout à fait normal. En fait, cette colonne est égale au nombre de saisons si l'observation est une série et Nan si l'observation est un film. Ainsi Nan peut être remplacé par 0 car un film possède 0 saisons.
# A movie has no seasons: NaN in `seasons` means "movie", so fill with 0.
credits.fillna({"seasons": 0}, inplace=True)
Nous allons maintenant traiter la deuxième colonne la plus problématique en proportion de Nan, age_certification. Les valeurs nan sont celles qui n'ont pas de certification. Cela signifie que nous pouvons les remplacer par "no_certif" par exemple.
# Missing age certifications are replaced by an explicit placeholder.
credits.fillna({"age_certification": "no_certif"}, inplace=True)
Nous remarquons également que les valeurs de certification de l'âge ne sont pas explicites. Mettons en correspondance de nouvelles valeurs pour cette colonne.
# Human-readable labels for the MPAA / TV Parental Guidelines codes
# appearing in `age_certification` (plus the "no_certif" placeholder
# introduced above). Keys must cover every code in the column, because
# Series.map sends unmapped values to NaN.
mapAC = {
"G":"Motion Picture : General Audiences",
"PG":"Motion Picture : Parental Guidance Suggested",
"PG-13":"Motion Picture : Parents Strongly Cautioned",
"R":"Motion Picture : Restricted",
"NC-17":"Motion Picture : Adults Only",
"TV-Y":"TV : All_children",
"TV-Y7":"TV : Directed to Older Children",
"TV-G":"TV : General Audience",
"TV-PG":"TV : Parental Guidance Suggested",
"TV-14":"TV : Parents Strongly Cautioned",
"TV-MA":"TV : Mature Audience Only",
"no_certif" : "No Certification"
}
Si nous voulons revenir en arrière, nous créons le mapping inverse.
# Inverse mapping (label -> code); valid because all labels are unique.
mapAC_reverse = dict(zip(mapAC.values(), mapAC.keys()))
Ensuite, nous pouvons appliquer le mapping.
# Replace the short certification codes by their verbose labels.
credits["age_certification"] = credits["age_certification"].map(mapAC)
Maintenant, nous étudions les rubriques liées à imdb.
# Inspect the IMDb-related columns (identifier, score, vote count).
credits[["imdb_id", "imdb_score", "imdb_votes"]]
| imdb_id | imdb_score | imdb_votes | |
|---|---|---|---|
| 0 | NaN | NaN | NaN |
| 1 | tt0075314 | 8.2 | 808582.0 |
| 2 | tt0068473 | 7.7 | 107673.0 |
| 3 | tt0071853 | 8.2 | 534486.0 |
| 4 | tt0061578 | 7.7 | 72662.0 |
| ... | ... | ... | ... |
| 5845 | tt13857480 | 6.8 | 45.0 |
| 5846 | tt11803618 | 7.7 | 348.0 |
| 5847 | tt14585902 | 3.8 | 68.0 |
| 5848 | NaN | NaN | NaN |
| 5849 | tt13711094 | 7.8 | 18.0 |
5850 rows × 3 columns
imdb_id n'est lié à aucune autre colonne des dataframes crédits ou titres. Elle n'apporte donc pas beaucoup d'informations et nous pouvons simplement la supprimer.
# imdb_id is unrelated to any other column, so it carries no signal here.
credits = credits.drop(columns="imdb_id")
Nous continuerons à travailler sur les variables imbd plus tard dans ce notebook.
La colonne description est en quelque sorte similaire aux autres colonnes qualitatives. Lorsque la description est nulle, nous pouvons simplement remplacer la cellule par "No description" par exemple.
# Missing descriptions get an explicit placeholder text.
credits.fillna({"description": "No description"}, inplace=True)
Quand on regarde les données de la colonne title, on peut observer que 0,02% des données sont des nan, ce qui est un très faible pourcentage de la data. De plus, un film sans titre n'est pas intéressant. Ainsi, nous supprimons les 0,02% de Nan de cette colonne.
# Drop the few rows (~0.02%) whose title is missing.
credits = credits.dropna(subset=["title"])
# NOTE(review): the result of reset_index is NOT assigned back and not
# in-place, so `credits` keeps its original (gapped) index after this
# cell; the call only produces the displayed frame. Confirm this is
# intentional — later cells do depend on the index state.
credits.reset_index(drop=True)
| id | title | type | description | release_year | age_certification | runtime | genres | production_countries | seasons | imdb_score | imdb_votes | tmdb_popularity | tmdb_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ts300399 | Five Came Back: The Reference Films | SHOW | This collection includes 12 World War II-era p... | 1945 | TV : Mature Audience Only | 51 | ['documentation'] | ['US'] | 1.0 | NaN | NaN | 0.600 | NaN |
| 1 | tm84618 | Taxi Driver | MOVIE | A mentally unstable Vietnam War veteran works ... | 1976 | Motion Picture : Restricted | 114 | ['drama', 'crime'] | ['US'] | 0.0 | 8.2 | 808582.0 | 40.965 | 8.179 |
| 2 | tm154986 | Deliverance | MOVIE | Intent on seeing the Cahulawassee River before... | 1972 | Motion Picture : Restricted | 109 | ['drama', 'action', 'thriller', 'european'] | ['US'] | 0.0 | 7.7 | 107673.0 | 10.010 | 7.300 |
| 3 | tm127384 | Monty Python and the Holy Grail | MOVIE | King Arthur, accompanied by his squire, recrui... | 1975 | Motion Picture : Parental Guidance Suggested | 91 | ['fantasy', 'action', 'comedy'] | ['GB'] | 0.0 | 8.2 | 534486.0 | 15.461 | 7.811 |
| 4 | tm120801 | The Dirty Dozen | MOVIE | 12 American military prisoners in World War II... | 1967 | No Certification | 150 | ['war', 'action'] | ['GB', 'US'] | 0.0 | 7.7 | 72662.0 | 20.398 | 7.600 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5844 | tm1014599 | Fine Wine | MOVIE | A beautiful love story that can happen between... | 2021 | No Certification | 100 | ['romance', 'drama'] | ['NG'] | 0.0 | 6.8 | 45.0 | 1.466 | NaN |
| 5845 | tm898842 | C/O Kaadhal | MOVIE | A heart warming film that explores the concept... | 2021 | No Certification | 134 | ['drama'] | [] | 0.0 | 7.7 | 348.0 | NaN | NaN |
| 5846 | tm1059008 | Lokillo | MOVIE | A controversial TV host and comedian who has b... | 2021 | No Certification | 90 | ['comedy'] | ['CO'] | 0.0 | 3.8 | 68.0 | 26.005 | 6.300 |
| 5847 | tm1035612 | Dad Stop Embarrassing Me - The Afterparty | MOVIE | Jamie Foxx, David Alan Grier and more from the... | 2021 | Motion Picture : Parents Strongly Cautioned | 37 | [] | ['US'] | 0.0 | NaN | NaN | 1.296 | 10.000 |
| 5848 | ts271048 | Mighty Little Bheem: Kite Festival | SHOW | With winter behind them, Bheem and his townspe... | 2021 | No Certification | 7 | ['family', 'animation', 'comedy'] | [] | 1.0 | 7.8 | 18.0 | 2.289 | 10.000 |
5849 rows × 14 columns
Nous remarquons que les colonnes pays et genre sont des listes. Cela les rend difficiles à utiliser dans nos opérations. Nous décidons de ne garder que le premier élément de chaque liste, le considérant comme le plus important. De plus, nous supprimons toutes les lignes qui n'ont pas de pays de production.
import ast  # stdlib; safe parsing of the stringified lists

# `genres` and `production_countries` hold stringified Python lists
# (e.g. "['drama', 'crime']"). Drop rows with an empty list, then keep
# only the first element of each list (treated as the main one).
# ast.literal_eval replaces the fragile manual split("['") parsing and
# correctly handles quoting/escaping variations.
credits = credits.drop(credits[credits['genres'] == '[]'].index)
credits['genres'] = [ast.literal_eval(x)[0] for x in credits["genres"]]
credits = credits.drop(credits[credits['production_countries'] == '[]'].index)
credits['production_countries'] = [ast.literal_eval(x)[0] for x in credits["production_countries"]]
Maintenant, nous faisons correspondre le nom des pays avec leur code pays.
map_country_code ={
"AF": "Afghanistan",
"AX": "Aland Islands",
"AL": "Albania",
"DZ": "Algeria",
"AS": "American Samoa",
"AD": "Andorra",
"AO": "Angola",
"AI": "Anguilla",
"AQ": "Antarctica",
"AG": "Antigua And Barbuda",
"AR": "Argentina",
"AM": "Armenia",
"AW": "Aruba",
"AU": "Australia",
"AT": "Austria",
"AZ": "Azerbaijan",
"BS": "Bahamas",
"BH": "Bahrain",
"BD": "Bangladesh",
"BB": "Barbados",
"BY": "Belarus",
"BE": "Belgium",
"BZ": "Belize",
"BJ": "Benin",
"BM": "Bermuda",
"BT": "Bhutan",
"BO": "Bolivia",
"BA": "Bosnia And Herzegovina",
"BW": "Botswana",
"BV": "Bouvet Island",
"BR": "Brazil",
"IO": "British Indian Ocean Territory",
"BN": "Brunei Darussalam",
"BG": "Bulgaria",
"BF": "Burkina Faso",
"BI": "Burundi",
"KH": "Cambodia",
"CM": "Cameroon",
"CA": "Canada",
"CV": "Cape Verde",
"KY": "Cayman Islands",
"CF": "Central African Republic",
"TD": "Chad",
"CL": "Chile",
"CN": "China",
"CX": "Christmas Island",
"CC": "Cocos (Keeling) Islands",
"CO": "Colombia",
"KM": "Comoros",
"CG": "Congo",
"CD": "Congo, Democratic Republic",
"CK": "Cook Islands",
"CR": "Costa Rica",
"CI": "Cote D\"Ivoire",
"HR": "Croatia",
"CU": "Cuba",
"CY": "Cyprus",
"CZ": "Czech Republic",
"DK": "Denmark",
"DJ": "Djibouti",
"DM": "Dominica",
"DO": "Dominican Republic",
"EC": "Ecuador",
"EG": "Egypt",
"SV": "El Salvador",
"GQ": "Equatorial Guinea",
"ER": "Eritrea",
"EE": "Estonia",
"ET": "Ethiopia",
"FK": "Falkland Islands (Malvinas)",
"FO": "Faroe Islands",
"FJ": "Fiji",
"FI": "Finland",
"FR": "France",
"GF": "French Guiana",
"PF": "French Polynesia",
"TF": "French Southern Territories",
"GA": "Gabon",
"GM": "Gambia",
"GE": "Georgia",
"DE": "Germany",
"GH": "Ghana",
"GI": "Gibraltar",
"GR": "Greece",
"GL": "Greenland",
"GD": "Grenada",
"GP": "Guadeloupe",
"GU": "Guam",
"GT": "Guatemala",
"GG": "Guernsey",
"GN": "Guinea",
"GW": "Guinea-Bissau",
"GY": "Guyana",
"HT": "Haiti",
"HM": "Heard Island & Mcdonald Islands",
"VA": "Holy See (Vatican City State)",
"HN": "Honduras",
"HK": "Hong Kong",
"HU": "Hungary",
"IS": "Iceland",
"IN": "India",
"ID": "Indonesia",
"IR": "Iran, Islamic Republic Of",
"IQ": "Iraq",
"IE": "Ireland",
"IM": "Isle Of Man",
"IL": "Israel",
"IT": "Italy",
"JM": "Jamaica",
"JP": "Japan",
"JE": "Jersey",
"JO": "Jordan",
"KZ": "Kazakhstan",
"KE": "Kenya",
"KI": "Kiribati",
"KR": "Korea",
"KP": "North Korea",
"KW": "Kuwait",
"KG": "Kyrgyzstan",
"LA": "Lao People\"s Democratic Republic",
"LV": "Latvia",
"LB": "Lebanon",
"LS": "Lesotho",
"LR": "Liberia",
"LY": "Libyan Arab Jamahiriya",
"LI": "Liechtenstein",
"LT": "Lithuania",
"Lebanon": "Lebanon",
"LU": "Luxembourg",
"MO": "Macao",
"MK": "Macedonia",
"MG": "Madagascar",
"MW": "Malawi",
"MY": "Malaysia",
"MV": "Maldives",
"ML": "Mali",
"MT": "Malta",
"MH": "Marshall Islands",
"MQ": "Martinique",
"MR": "Mauritania",
"MU": "Mauritius",
"YT": "Mayotte",
"MX": "Mexico",
"FM": "Micronesia, Federated States Of",
"MD": "Moldova",
"MC": "Monaco",
"MN": "Mongolia",
"ME": "Montenegro",
"MS": "Montserrat",
"MA": "Morocco",
"MZ": "Mozambique",
"MM": "Myanmar",
"NA": "Namibia",
"NR": "Nauru",
"NP": "Nepal",
"NL": "Netherlands",
"AN": "Netherlands Antilles",
"NC": "New Caledonia",
"NZ": "New Zealand",
"NI": "Nicaragua",
"NE": "Niger",
"NG": "Nigeria",
"NU": "Niue",
"NF": "Norfolk Island",
"MP": "Northern Mariana Islands",
"NO": "Norway",
"OM": "Oman",
"PK": "Pakistan",
"PW": "Palau",
"PS": "Palestinian Territory, Occupied",
"PA": "Panama",
"PG": "Papua New Guinea",
"PY": "Paraguay",
"PE": "Peru",
"PH": "Philippines",
"PN": "Pitcairn",
"PL": "Poland",
"PT": "Portugal",
"PR": "Puerto Rico",
"QA": "Qatar",
"RE": "Reunion",
"RO": "Romania",
"RU": "Russian Federation",
"RW": "Rwanda",
"BL": "Saint Barthelemy",
"SH": "Saint Helena",
"KN": "Saint Kitts And Nevis",
"LC": "Saint Lucia",
"MF": "Saint Martin",
"PM": "Saint Pierre And Miquelon",
"VC": "Saint Vincent And Grenadines",
"WS": "Samoa",
"SM": "San Marino",
"ST": "Sao Tome And Principe",
"SA": "Saudi Arabia",
"SN": "Senegal",
"RS": "Serbia",
"SC": "Seychelles",
"SL": "Sierra Leone",
"SG": "Singapore",
"SK": "Slovakia",
"SI": "Slovenia",
"SB": "Solomon Islands",
"SO": "Somalia",
"ZA": "South Africa",
"GS": "South Georgia And Sandwich Isl.",
"ES": "Spain",
"LK": "Sri Lanka",
"SD": "Sudan",
"SR": "Suriname",
"SJ": "Svalbard And Jan Mayen",
"SZ": "Swaziland",
"SE": "Sweden",
"CH": "Switzerland",
"SY": "Syrian Arab Republic",
"TW": "Taiwan",
"TJ": "Tajikistan",
"TZ": "Tanzania",
"TH": "Thailand",
"TL": "Timor-Leste",
"TG": "Togo",
"TK": "Tokelau",
"TO": "Tonga",
"TT": "Trinidad And Tobago",
"TN": "Tunisia",
"TR": "Turkey",
"TM": "Turkmenistan",
"TC": "Turks And Caicos Islands",
"TV": "Tuvalu",
"UG": "Uganda",
"UA": "Ukraine",
"AE": "United Arab Emirates",
"GB": "United Kingdom",
"US": "United States",
"UM": "United States Outlying Islands",
"UY": "Uruguay",
"UZ": "Uzbekistan",
"VU": "Vanuatu",
"VE": "Venezuela",
"VN": "Vietnam",
"VG": "Virgin Islands, British",
"VI": "Virgin Islands, U.S.",
"WF": "Wallis And Futuna",
"EH": "Western Sahara",
"YE": "Yemen",
"ZM": "Zambia",
"ZW": "Zimbabwe"
}
# Manual patches on top of the ISO-3166 table above.
map_country_code["SU"] = "Russia"  # Soviet-era code; treated as Russia for simplicity
map_country_code["AE"] = "United Arab Emirates"  # already present above; redundant but harmless
map_country_code["Lebanon"] = "Lebanon"  # some rows carry the full name instead of the "LB" code
# Bug fix: in the dict literal above, the `\"` escapes inserted a double
# quote where an apostrophe was intended ('Cote D"Ivoire',
# 'Lao People"s ...'). Override with the correct spellings.
map_country_code["CI"] = "Cote D'Ivoire"
map_country_code["LA"] = "Lao People's Democratic Republic"
# Map country codes to full names; codes absent from the table become NaN
# and those rows are removed. `.notna()` replaces the non-idiomatic
# `.isna() == False` comparison.
credits["production_countries"] = credits["production_countries"].map(map_country_code)
credits = credits[credits["production_countries"].notna()]
credits.head()
| id | title | type | description | release_year | age_certification | runtime | genres | production_countries | seasons | imdb_score | imdb_votes | tmdb_popularity | tmdb_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ts300399 | Five Came Back: The Reference Films | SHOW | This collection includes 12 World War II-era p... | 1945 | TV : Mature Audience Only | 51 | documentation | United States | 1.0 | NaN | NaN | 0.600 | NaN |
| 1 | tm84618 | Taxi Driver | MOVIE | A mentally unstable Vietnam War veteran works ... | 1976 | Motion Picture : Restricted | 114 | drama | United States | 0.0 | 8.2 | 808582.0 | 40.965 | 8.179 |
| 2 | tm154986 | Deliverance | MOVIE | Intent on seeing the Cahulawassee River before... | 1972 | Motion Picture : Restricted | 109 | drama | United States | 0.0 | 7.7 | 107673.0 | 10.010 | 7.300 |
| 3 | tm127384 | Monty Python and the Holy Grail | MOVIE | King Arthur, accompanied by his squire, recrui... | 1975 | Motion Picture : Parental Guidance Suggested | 91 | fantasy | United Kingdom | 0.0 | 8.2 | 534486.0 | 15.461 | 7.811 |
| 4 | tm120801 | The Dirty Dozen | MOVIE | 12 American military prisoners in World War II... | 1967 | No Certification | 150 | war | United Kingdom | 0.0 | 7.7 | 72662.0 | 20.398 | 7.600 |
Nous observons des NaN dans les colonnes suivantes : imdb_score, imdb_votes, tmdb_popularity, tmdb_score. Pour corriger les NaN, nous allons les prédire. Le problème majeur de cette prédiction est le fait que les genres sont sous forme de texte. Pour y remédier, nous allons charger un modèle de vectorisation entraîné sur des articles Google et ainsi vectoriser les genres.
# Keep only the columns used for NaN imputation (numeric + genres text).
df = credits[["id","release_year","runtime","genres","seasons","imdb_score","imdb_votes","tmdb_popularity","tmdb_score"]]
# Save the id column aside so it can be re-attached at the end.
# NOTE(review): `id` shadows the builtin id(); it is read again much
# later (test["id"] = id), so renaming it would require editing that
# distant cell as well.
id = df['id']
del df['id']
# `work` keeps a copy (with the original credits index) of all target
# columns before three of them are removed from df.
work = df.copy()
del df['imdb_votes']
del df['tmdb_popularity']
del df['tmdb_score']
# Load vectors directly from the file
model = KeyedVectors.load_word2vec_format('D:/COURS/A4/S7 - ESILV/Python for Data Analysis/TD/PW6/GoogleNews-vectors-negative300.bin', binary=True)
vect = []
for x in df['genres']:
vect.append(model[x])
genresV = pd.DataFrame(vect)
essai = df.reset_index()
df = pd.concat([essai,genresV], axis = 1)
del df['genres']
del df['index']
test= df
test['imdb_score'] = test['imdb_score'].fillna(-1)
test = test.drop(test[test['imdb_score'] == -1].index)
On obtient donc un DataFrame avec 300 colonnes par genre : ce sont les vecteurs.
test
| release_year | runtime | seasons | imdb_score | 0 | 1 | 2 | 3 | 4 | 5 | ... | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1976 | 114 | 0.0 | 8.2 | 0.199219 | 0.038574 | 0.013184 | 0.057861 | 0.175781 | 0.046631 | ... | -0.157227 | -0.129883 | -0.267578 | -0.228516 | 0.109375 | 0.040527 | -0.104004 | -0.271484 | -0.077148 | 0.189453 |
| 2 | 1972 | 109 | 0.0 | 7.7 | 0.199219 | 0.038574 | 0.013184 | 0.057861 | 0.175781 | 0.046631 | ... | -0.157227 | -0.129883 | -0.267578 | -0.228516 | 0.109375 | 0.040527 | -0.104004 | -0.271484 | -0.077148 | 0.189453 |
| 3 | 1975 | 91 | 0.0 | 8.2 | 0.306641 | -0.169922 | 0.066406 | 0.065430 | 0.018311 | 0.109375 | ... | -0.068359 | 0.402344 | 0.137695 | -0.090820 | -0.166992 | -0.125000 | 0.042725 | 0.100586 | 0.030029 | 0.114746 |
| 4 | 1967 | 150 | 0.0 | 7.7 | 0.339844 | 0.304688 | 0.098633 | 0.167969 | -0.056396 | 0.023315 | ... | -0.199219 | -0.230469 | -0.161133 | 0.289062 | 0.062012 | -0.131836 | -0.137695 | 0.000153 | 0.082031 | 0.208984 |
| 5 | 1969 | 30 | 4.0 | 8.8 | -0.029541 | -0.058350 | -0.002136 | 0.347656 | -0.025635 | -0.091309 | ... | 0.076660 | 0.289062 | -0.386719 | -0.177734 | -0.072266 | 0.175781 | 0.026611 | -0.131836 | 0.093262 | 0.062988 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5558 | 2022 | 102 | 0.0 | 7.0 | -0.029541 | -0.058350 | -0.002136 | 0.347656 | -0.025635 | -0.091309 | ... | 0.076660 | 0.289062 | -0.386719 | -0.177734 | -0.072266 | 0.175781 | 0.026611 | -0.131836 | 0.093262 | 0.062988 |
| 5559 | 2021 | 115 | 0.0 | 7.1 | -0.029541 | -0.058350 | -0.002136 | 0.347656 | -0.025635 | -0.091309 | ... | 0.076660 | 0.289062 | -0.386719 | -0.177734 | -0.072266 | 0.175781 | 0.026611 | -0.131836 | 0.093262 | 0.062988 |
| 5560 | 2021 | 93 | 0.0 | 5.0 | 0.237305 | -0.187500 | -0.250000 | 0.144531 | -0.035645 | 0.000793 | ... | -0.107910 | 0.135742 | -0.265625 | -0.181641 | 0.200195 | -0.102539 | 0.197266 | -0.128906 | -0.056396 | 0.347656 |
| 5561 | 2021 | 100 | 0.0 | 6.8 | 0.237305 | -0.187500 | -0.250000 | 0.144531 | -0.035645 | 0.000793 | ... | -0.107910 | 0.135742 | -0.265625 | -0.181641 | 0.200195 | -0.102539 | 0.197266 | -0.128906 | -0.056396 | 0.347656 |
| 5562 | 2021 | 90 | 0.0 | 3.8 | -0.029541 | -0.058350 | -0.002136 | 0.347656 | -0.025635 | -0.091309 | ... | 0.076660 | 0.289062 | -0.386719 | -0.177734 | -0.072266 | 0.175781 | 0.026611 | -0.131836 | 0.093262 | 0.062988 |
5170 rows × 304 columns
Pour faire la prédiction, nous allons utiliser un KNN (k plus proches voisins).
On commence par diviser la data en X et Y, tel que X sont les prédicteurs et Y les valeurs prédites.
# Imputation target #1: imdb_score. Train a KNN regressor on the rows
# where the score is known (X = year, runtime, seasons + 300 embedding
# dims; y = imdb_score).
y = test['imdb_score']
x = test.drop(['imdb_score'], axis = 1)
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=SEED)
scaler = StandardScaler()
# Fit only on X_train (avoids leaking test-set statistics)
scaler.fit(X_train)
# Scale both X_train and X_test
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=5)
regressor.fit(X_train, y_train)
KNeighborsRegressor()
# Impute the missing imdb_score values with the trained KNN.
# The original cell walked the frame row by row and wrote through a
# hard-coded column position (iat[i, 3]); the boolean-mask form below is
# equivalent (rows are visited in the same order), faster, and robust to
# column reordering.
df['imdb_score'] = df['imdb_score'].fillna(-1)
test = df
mask = test['imdb_score'] == -1.0
topredict = test.loc[mask].drop(columns=['imdb_score'])
topredict = scaler.transform(topredict)
y_pred = regressor.predict(topredict)
test.loc[mask, 'imdb_score'] = y_pred
Ici, nous avons fini de prédire notre première colonne.
# Imputation target #2: imdb_votes. Bring the column back from `work`.
# NOTE(review): `work` kept the ORIGINAL credits index while `test` was
# reset to 0..n-1; this assignment aligns on index labels, so values may
# be misaligned and labels outside the reset range become NaN. Verify
# against the source data before trusting the imputed votes.
test['imdb_votes'] = work['imdb_votes']
save = test.copy()
test['imdb_votes'] = test['imdb_votes'].fillna(-1)
test = test.drop(test[test['imdb_votes'] == -1].index)
y = test['imdb_votes']
x = test.drop(['imdb_votes'], axis = 1)
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=SEED)
scaler = StandardScaler()
# Fit only on X_train
scaler.fit(X_train)
# Scale both X_train and X_test
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=5)
regressor.fit(X_train, y_train)
KNeighborsRegressor()
# Impute missing imdb_votes with the freshly trained KNN — vectorized
# replacement of the row loop writing through hard-coded iat[i, 304].
# NOTE(review): the original read df['imdb_votes'] here (not save); df
# aliases the frame that received the column earlier, so the behavior is
# preserved as-is.
save['imdb_votes'] = df['imdb_votes'].fillna(-1)
test = save
mask = test['imdb_votes'] == -1.0
topredict = test.loc[mask].drop(columns=['imdb_votes'])
topredict = scaler.transform(topredict)
y_pred = regressor.predict(topredict)
test.loc[mask, 'imdb_votes'] = y_pred
La deuxième colonne à prédire est nettoyée de ses NaN, nous pouvons passer à la troisième.
# Imputation target #3: tmdb_popularity, same pipeline as above.
# NOTE(review): same index-alignment caveat as for imdb_votes — `work`
# carries the original credits index.
test['tmdb_popularity'] = work['tmdb_popularity']
save = test.copy()
test['tmdb_popularity'] = test['tmdb_popularity'].fillna(-1)
test = test.drop(test[test['tmdb_popularity'] == -1].index)
y = test['tmdb_popularity']
x = test.drop(['tmdb_popularity'], axis = 1)
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=SEED)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit only on X_train
scaler.fit(X_train)
# Scale both X_train and X_test
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=5)
regressor.fit(X_train, y_train)
KNeighborsRegressor()
# Impute missing tmdb_popularity — vectorized replacement of the row
# loop writing through hard-coded iat[i, 305].
save['tmdb_popularity'] = save['tmdb_popularity'].fillna(-1)
test = save
mask = test['tmdb_popularity'] == -1.0
topredict = test.loc[mask].drop(columns=['tmdb_popularity'])
topredict = scaler.transform(topredict)
y_pred = regressor.predict(topredict)
test.loc[mask, 'tmdb_popularity'] = y_pred
Enfin, nous passons à la 4ème.
# Imputation target #4: tmdb_score, same pipeline as the previous three.
# NOTE(review): same index-alignment caveat — `work` carries the original
# credits index while `test` is positionally indexed.
test['tmdb_score'] = work['tmdb_score']
save = test.copy()
test['tmdb_score'] = test['tmdb_score'].fillna(-1)
test = test.drop(test[test['tmdb_score'] == -1].index)
y = test['tmdb_score']
x = test.drop(['tmdb_score'], axis = 1)
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=SEED)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit only on X_train
scaler.fit(X_train)
# Scale both X_train and X_test
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=5)
regressor.fit(X_train, y_train)
# Impute missing tmdb_score — vectorized replacement of the row loop
# writing through hard-coded iat[i, 306].
save['tmdb_score'] = save['tmdb_score'].fillna(-1)
test = save
mask = test['tmdb_score'] == -1.0
topredict = test.loc[mask].drop(columns=['tmdb_score'])
topredict = scaler.transform(topredict)
y_pred = regressor.predict(topredict)
test.loc[mask, 'tmdb_score'] = y_pred
# Re-attach the id column saved at the start of the imputation section
# (the `id` variable shadows the builtin id()).
# NOTE(review): `id` still carries the original credits index while
# `test` has a reset 0..n-1 index — this assignment aligns on labels and
# may misplace or NaN some identifiers; verify.
test["id"] = id
df = test.copy()
# Keep only the identifier plus the four (now imputed) numeric columns.
df = df[['id','release_year','runtime','seasons','imdb_score','imdb_votes','tmdb_popularity','tmdb_score']]
On obtient donc un dataframe avec les 4 colonnes à prédire sans aucun NaN :
df.head(3)
| id | release_year | runtime | seasons | imdb_score | imdb_votes | tmdb_popularity | tmdb_score | |
|---|---|---|---|---|---|---|---|---|
| 0 | ts300399 | 1945 | 51 | 1.0 | 7.3 | 14216.6 | 0.600 | 7.260 |
| 1 | tm84618 | 1976 | 114 | 0.0 | 8.2 | 808582.0 | 40.965 | 8.179 |
| 2 | tm154986 | 1972 | 109 | 0.0 | 7.7 | 107673.0 | 10.010 | 7.300 |
credits.head(3)
| id | title | type | description | release_year | age_certification | runtime | genres | production_countries | seasons | imdb_score | imdb_votes | tmdb_popularity | tmdb_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ts300399 | Five Came Back: The Reference Films | SHOW | This collection includes 12 World War II-era p... | 1945 | TV : Mature Audience Only | 51 | documentation | United States | 1.0 | NaN | NaN | 0.600 | NaN |
| 1 | tm84618 | Taxi Driver | MOVIE | A mentally unstable Vietnam War veteran works ... | 1976 | Motion Picture : Restricted | 114 | drama | United States | 0.0 | 8.2 | 808582.0 | 40.965 | 8.179 |
| 2 | tm154986 | Deliverance | MOVIE | Intent on seeing the Cahulawassee River before... | 1972 | Motion Picture : Restricted | 109 | drama | United States | 0.0 | 7.7 | 107673.0 | 10.010 | 7.300 |
# Write the four imputed columns back into a positionally re-indexed
# copy of credits, then replace credits with the merged result.
newcre = credits.copy()
newcre.reset_index(drop=True, inplace=True)
df.reset_index(drop=True, inplace=True)
for col in ("imdb_score", "imdb_votes", "tmdb_popularity", "tmdb_score"):
    newcre[col] = df[col]
credits = newcre.copy()
Maintenant, éliminons les doublons s'il y en a dans le dataframe credits.
# Remove duplicate rows from credits, if any, and preview the result.
credits = credits.drop_duplicates()
credits.head(5)
| id | title | type | description | release_year | age_certification | runtime | genres | production_countries | seasons | imdb_score | imdb_votes | tmdb_popularity | tmdb_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ts300399 | Five Came Back: The Reference Films | SHOW | This collection includes 12 World War II-era p... | 1945 | TV : Mature Audience Only | 51 | documentation | United States | 1.0 | 7.3 | 14216.6 | 0.600 | 7.260 |
| 1 | tm84618 | Taxi Driver | MOVIE | A mentally unstable Vietnam War veteran works ... | 1976 | Motion Picture : Restricted | 114 | drama | United States | 0.0 | 8.2 | 808582.0 | 40.965 | 8.179 |
| 2 | tm154986 | Deliverance | MOVIE | Intent on seeing the Cahulawassee River before... | 1972 | Motion Picture : Restricted | 109 | drama | United States | 0.0 | 7.7 | 107673.0 | 10.010 | 7.300 |
| 3 | tm127384 | Monty Python and the Holy Grail | MOVIE | King Arthur, accompanied by his squire, recrui... | 1975 | Motion Picture : Parental Guidance Suggested | 91 | fantasy | United Kingdom | 0.0 | 8.2 | 534486.0 | 15.461 | 7.811 |
| 4 | tm120801 | The Dirty Dozen | MOVIE | 12 American military prisoners in World War II... | 1967 | No Certification | 150 | war | United Kingdom | 0.0 | 7.7 | 72662.0 | 20.398 | 7.600 |
# Final shape after all cleaning steps.
credits.shape
(5563, 14)
Pour terminer cette partie, nous vérifions qu'il ne reste aucune valeur nan dans credits.
# Final NaN audit: every column should now read 0.0%.
(credits.isna().sum(axis=0)/credits.shape[0]*100).round(2).sort_values(ascending=False)
id 0.0 title 0.0 type 0.0 description 0.0 release_year 0.0 age_certification 0.0 runtime 0.0 genres 0.0 production_countries 0.0 seasons 0.0 imdb_score 0.0 imdb_votes 0.0 tmdb_popularity 0.0 tmdb_score 0.0 dtype: float64
Tout d'abord, nous aimerions examiner la répartition des rôles dans l'ensemble de données.
# Pie chart of the role split (ACTOR vs DIRECTOR) in the cast table.
role_counts = titles["role"].value_counts()
_ = plt.pie(x=role_counts,
            labels=role_counts.index,
            autopct='%1.0f%%',
            shadow=True,
            explode=[0, 0.2],
            colors=["#E50913", "black"],
            startangle=35)
plt.title("Graph circulaire de la répartition des rôles dans les films")
Text(0.5, 1.0, 'Graph circulaire de la répartition des rôles dans les films')
Les autres colonnes de cet ensemble de données ont trop de résultats possibles pour permettre des visualisations univariées intéressantes.
Regardons le nombre de films sortis par an.
# Sorted view of release years (span: 1945 to 2022 per the output below).
credits["release_year"].sort_values()
0 1945
22 1954
13 1954
23 1956
14 1958
...
4914 2022
5326 2022
4701 2022
4695 2022
4681 2022
Name: release_year, Length: 5563, dtype: int64
# Line plot: number of titles released per year.
titles_per_year = credits.groupby(by="release_year").id.count()
_ = plt.plot(titles_per_year, c="crimson")
plt.title("Nombre de films sortis par année de 1945 à 2022")
plt.xlabel("Année de sortie", c="navy")
plt.ylabel("Nombre de films", c="navy")
Text(0, 0.5, 'Nombre de films')
De 1945 à 2020, le nombre de films a augmenté de façon exponentielle. La chute brutale en 2020 est certainement due au gel de l'économie mondiale par le covid.
Maintenant, nous pouvons examiner la répartition des colonnes qualitatives telles que le type, la certification d'âge et les genres.
# 2x2 grid of bar charts; the lower-right axes is removed so only three
# charts (type, age certification, genres) are shown.
f, ax = plt.subplots(2, 2, figsize=(20, 15))
# NOTE(review): a figure-level title would require f.suptitle(...);
# Figure has no .title() method, which is presumably why this was
# disabled: "Frequencies in Type, Age Certification and Genres columns".
width = 0.35
plt.delaxes(ax[1][1])
# Unique title count per type (MOVIE / SHOW).
type_bar = credits.groupby(by="type").nunique()
x_type = np.arange(len(type_bar))
ax[0][0].bar(x_type, type_bar.id, color=["coral", "orangered"])
ax[0][0].set_xticks(x_type)
ax[0][0].set_xticklabels(type_bar.index)
ax[0][0].set_title("Fréquences des types de films")
# Unique title count per age certification; tick labels are mapped back
# to the short codes for readability.
age_certif_bar = credits.groupby(by="age_certification").nunique()
x_age_certif = np.arange(len(age_certif_bar))
ax[0][1].bar(x_age_certif, age_certif_bar.id, color=["cornflowerblue", "royalblue"])
ax[0][1].set_xticks(x_age_certif)
ax[0][1].set_xticklabels(age_certif_bar.index.map(mapAC_reverse))
ax[0][1].set_title("Fréquence des certifications d'âge parmi les films")
# Unique title count per (main) genre.
genre_bar = credits.groupby(by="genres").nunique()
x_genre = np.arange(len(genre_bar))
ax[1][0].bar(x_genre, genre_bar.id, color=["limegreen", "seagreen"])
ax[1][0].set_xticks(x_genre)
ax[1][0].set_xticklabels(genre_bar.index, rotation=50, horizontalalignment='right')
ax[1][0].set_title("Fréquence des genres parmi les films")
plt.tight_layout()
# Frequency of each production country (unique titles per country).
f, ax = plt.subplots(figsize=(20, 5))
country_counts = credits.groupby(by="production_countries").nunique()
positions = np.arange(len(country_counts))
ax.bar(positions, country_counts.id, color=["gold", "goldenrod"])
ax.set_xticks(positions)
ax.set_xticklabels(country_counts.index, rotation=50, horizontalalignment='right')
ax.set_title("Fréquence des pays de production parmi les films")
plt.tight_layout()
Les États-Unis sont fortement dominants (Hollywood) suivis par l'Inde (Bollywood), essayons de les retirer du graphique.
# Same country frequencies, with the two dominant producers excluded.
f, ax = plt.subplots(figsize=(20, 5))
others = credits[~credits["production_countries"].isin(["United States", "India"])]
pc_bar = others.groupby(by="production_countries").nunique()
positions = np.arange(len(pc_bar))
ax.bar(positions, pc_bar.id, color=["gold", "goldenrod"])
ax.set_xticks(positions)
ax.set_xticklabels(pc_bar.index, rotation=50, horizontalalignment='right')
ax.set_title("Fréquence des pays de production parmi les films (sans Etats Unis et Inde)")
plt.tight_layout()
Changeons maintenant de sujet et visualisons les données quantitatives. Nous allons d'abord nous concentrer sur les variables imdb et tmdb.
# Boxplots of the IMDB variables; votes are log-scaled to tame their heavy tail.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))
imdb_score = credits["imdb_score"]
imdb_votes = credits["imdb_votes"]
tmdb_v = credits[["tmdb_popularity", "tmdb_score"]].dropna()  # NOTE(review): appears unused below — confirm before removing
bplot1 = ax1.boxplot(imdb_score,
                     vert=True,           # vertical box alignment
                     patch_artist=True,   # fill with color
                     labels=["imdb_score"])  # will be used to label x-ticks
ax1.set_title('Boite à moustache pour IMDB score')  # BUG FIX: title said "moustage"
bplot2 = ax2.boxplot(np.log(imdb_votes),
                     vert=True,           # vertical box alignment
                     patch_artist=True,   # fill with color
                     labels=["log(imdb_votes)"])  # BUG FIX: label now reflects the log transform
ax2.set_title('Boite à moustache pour IMDB votes')  # BUG FIX: title said "moustage"
# fill with colors
colors = ['pink']
for bplot in (bplot1, bplot2):
    for patch, color in zip(bplot['boxes'], colors):
        patch.set_facecolor(color)
# adding horizontal grid lines
for ax in [ax1, ax2]:
    ax.yaxis.grid(True)
    ax.set_ylabel('Valeurs observées')
plt.tight_layout()
Traçons maintenant ces variables avec des histogrammes.
# Histograms of imdb_score and log(imdb_votes).
f, axs = plt.subplots(1, 2)
logCredit = credits.copy()
logCredit["imdb_votes"] = np.log(logCredit["imdb_votes"])
credits.hist(column = "imdb_score", bins=20, color = "purple", zorder=2, rwidth=0.85, ax = axs[0])
logCredit.hist(column = "imdb_votes", bins=20, color = "purple", zorder=2, rwidth=0.85, ax = axs[1])
axs[0].set_title("Histogramme de imdb_score")
axs[0].set_xlabel("imdb_score")  # BUG FIX: axis was mislabelled "tmdb_score"
axs[1].set_title("Histogramme de imdb_votes")
axs[1].set_xlabel("log(imdb_votes)")
# Strip three spines and label the y-axis on both panels.
for elem in axs:
    elem.spines['right'].set_visible(False)
    elem.spines['top'].set_visible(False)
    elem.spines['left'].set_visible(False)
    elem.set_ylabel("Fréquences")
plt.tight_layout()
Nous pouvons faire la même chose pour les variables tmdb.
# Boxplots of the TMDB variables; popularity is log-scaled (heavy tail).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))
tmdb_score = credits["tmdb_score"]
tmdb_popularity = credits["tmdb_popularity"]
box_specs = [
    (ax1, tmdb_score, "tmdb_score", 'Boite à moustache pour TMDB score'),
    (ax2, np.log(tmdb_popularity), "tmdb_popularity", 'Boite à moustache pour TMDB popularity'),
]
for axis, data, label, box_title in box_specs:
    bplot = axis.boxplot(data,
                         vert=True,          # vertical box alignment
                         patch_artist=True,  # fill with color
                         labels=[label])     # labels the x-tick
    bplot['boxes'][0].set_facecolor('pink')
    axis.set_title(box_title)
    axis.yaxis.grid(True)  # horizontal grid lines
    axis.set_ylabel('Valeurs observées')
plt.tight_layout()
Comme pour les variables imdb, créons des histogrammes de ces deux colonnes.
# Histograms of tmdb_score and log(tmdb_popularity).
f, axs = plt.subplots(1, 2)
logCredit = credits.copy()
logCredit["tmdb_popularity"] = np.log(logCredit["tmdb_popularity"])
credits.hist(column="tmdb_score", bins=20, color="firebrick", zorder=2, rwidth=0.85, ax=axs[0])
logCredit.hist(column="tmdb_popularity", bins=20, color="firebrick", zorder=2, rwidth=0.85, ax=axs[1])
panel_text = [("Histogramme de tmdb_score", "tmdb_score"),
              ("Histogramme de tmdb_popularity", "log(tmdb_popularity)")]
for axis, (panel_title, xlab) in zip(axs, panel_text):
    axis.set_title(panel_title)
    axis.set_xlabel(xlab)
    for side in ('right', 'top', 'left'):
        axis.spines[side].set_visible(False)
    axis.set_ylabel("Fréquences")
plt.tight_layout()
Dans un premier temps, regardons le nombre de films fait par chaque pays depuis 1945.
# --- Treemap: number of titles per country since 1945 ---
# Count of titles per (country, year), then cumulative sum across years.
map_data = credits[["production_countries", "release_year", "id"]]
grouped_countries = map_data.pivot_table(index="production_countries", columns="release_year", values="id", aggfunc="count")
grouped_countries.fillna(0, inplace=True)
grouped_countries_cumulative = grouped_countries.cumsum(axis=1)
# Map each title's country name back to its ISO-2 code.
geocon = credits.copy()
continent = []
map_country_code_reverse = {value:key for key, value in map_country_code.items()}
geocon["iso2"] = geocon["production_countries"].map(map_country_code_reverse)
#geocon[geocon["iso2"]=="SU"]["iso2"] = "RU"
# Resolve a continent code per title. row[8] addresses "production_countries"
# by position — fragile if columns ever shift.
# NOTE(review): the branches compare the same field to both ISO codes ("RU",
# "SU") and full names ("Russia", ...) — some branches may never match; confirm.
for index, row in geocon.iterrows():
    if row[8] == "RU":
        continent.append("AS")
    elif row[8] == "SU":
        continent.append("AS")
    elif row[8] == "Russia":
        continent.append("AS")
    elif row[8] == "Lebanon":
        continent.append("AS")
    elif row[8] == "Colombia":
        continent.append("SA")
    else:
        continent.append(pc.country_alpha2_to_continent_code(map_country_code_reverse[row[8]]))
# Continent code -> display name for the treemap legend.
continents_dic = {
    'NA': 'North America',
    'SA': 'South America',
    'AS': 'Asia',
    'OC': 'Oceania',
    'AF': 'Africa',
    'EU': 'Europe'
}
geocon["Continent"] = continent
geocon["Continent"] = geocon["Continent"].map(continents_dic)
# Per-title weight: row sum of the cumulative table for the title's country.
# NOTE(review): summing a cumulative series integrates over years rather than
# giving the final count — presumably intended as a size proxy; confirm.
values = []
sum_countries = grouped_countries_cumulative.sum(axis=1)
for index, row in geocon.iterrows():
    values.append(sum_countries[row[8]])
geocon["values"] = values
fig = px.treemap(geocon, path=[px.Constant("World"), 'Continent', 'production_countries'], values='values',
                 color='Continent', hover_data=['iso2'],
                 color_continuous_scale='RdBu',
                 title = "Tree Map du nombre de films fait par pays dans le monde depuis 1945")
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()
Maintenant regardons le nombre de film par année pour chaque pays depuis 1945.
# --- Animated map: cumulative number of titles per country per year ---
map_data = credits[["production_countries", "release_year", "id"]]
grouped_countries = map_data.pivot_table(index="production_countries", columns="release_year", values="id", aggfunc="count")
grouped_countries.fillna(0, inplace=True)
grouped_countries_cumulative = grouped_countries.cumsum(axis=1)
# For each title, look up the cumulative count for its (country, year).
# row[0]/row[1] address production_countries / release_year by position.
values = []
prevRow = 0
for index, row in map_data.iterrows():
    values.append(grouped_countries_cumulative.loc[row[0], row[1]])
geodata = map_data.copy()
geodata["values"] = values
map_country_code_reverse = {value:key for key, value in map_country_code.items()}
geodata["iso"] = geodata["production_countries"].map(map_country_code_reverse)
# Reference table of country codes and coordinates; a few names are patched so
# they match the names used in this dataset.
# NOTE(review): chained indexing (country_info.loc[i]["Country"] = ...) assigns
# through a temporary and may not persist in country_info — confirm the renames
# actually stick.
country_info = pd.read_csv("countries_codes_and_coordinates.csv")
indexKOR = country_info[country_info["Country"]=="South Korea"]["Country"].index
country_info.loc[indexKOR.values[0]]["Country"] = "Korea"
indexCONGO = country_info[country_info["Country"]=="Congo"]["Country"].index
country_info.loc[indexCONGO.values[0]]["Country"] = "Congo, Democratic Republic"
indexIRAN = country_info[country_info["Country"]=="Iran, Islamic Republic of"]["Country"].index
country_info.loc[indexIRAN.values[0]]["Country"] = "Iran, Islamic Republic Of"
indexTanzania = country_info[country_info["Country"]=="Tanzania, United Republic of"]["Country"].index
country_info.loc[indexTanzania.values[0]]["Country"] = "Tanzania"
del country_info["Alpha-2 code"]
# Collect ISO-3 code and average coordinates for every title row.
iso3=[]
lat=[]
lon=[]
i=0
for index, row in geodata.iterrows():
    cur_info = country_info[country_info["Country"]==row[0]]
    iso3.append(cur_info["Alpha-3 code"])
    lat.append(cur_info["Latitude (average)"])
    lon.append(cur_info["Longitude (average)"])
# The reference CSV wraps values in literal double quotes; strip them.
iso3_clean = []
lat_clean = []
lon_clean = []
for x in iso3:
    iso3_clean.append(x.values[0].split("\"")[1].split("\"")[0])
for x in lat:
    lat_clean.append(x.values[0].split("\"")[1].split("\"")[0])
for x in lon:
    lon_clean.append(x.values[0].split("\"")[1].split("\"")[0])
geodata["iso3"] = iso3_clean
geodata["lat_clean"] = lat_clean
geodata["lon_clean"] = lon_clean
# Animated bubble map, one frame per release year.
fig = px.scatter_geo(geodata.sort_values(by="release_year"), size="values",color= 'values',
                     animation_frame="release_year", lat="lat_clean", lon="lon_clean",range_color=(0, 2000),
                     projection="natural earth", size_max=40, title="Evolution of number of films per year")
fig.show()
Maintenant, observons la note imdb et tmdb par genre.
# Mean IMDB / TMDB score per genre, drawn as a heatmap.
# Selecting the two score columns before .mean() keeps this working on pandas
# versions where groupby().mean() raises on non-numeric columns; the heatmap
# only ever displayed these two rows anyway.
df = credits.groupby('genres')[['imdb_score', 'tmdb_score']].mean().T
fig = px.imshow(df.loc[['imdb_score','tmdb_score']],title="Note moyenne par genre", color_continuous_scale='reds')
fig.show()
Nous nous intéressons au nombre de films par genre.
# Number of titles per genre, drawn as a line chart.
df = credits.groupby('genres').count()
genre_index = df.index
fig = px.line(
    df,
    x=genre_index,
    y='id',
    title='Nombre de films et séries par genre',
    color_discrete_map={"id": "black"},
)
fig.show()
Ensuite, nous pouvons observer la proportion de chaque genre et leur film associé.
# Sunburst of the first 1000 (genre, film) rows: the share of each genre and
# the titles attached to it.
title_by_id = credits.set_index('id')['title'].to_dict()
genre_by_id = credits.set_index('id')['genres'].to_dict()
cast = titles.copy()
cast['name film'] = cast['id'].map(title_by_id)
cast['genres'] = cast['id'].map(genre_by_id)
cast = cast[['genres', 'name film', 'name']].dropna()
cast['values'] = 1
sample = cast[:1000]
fig = px.sunburst(sample, path=['genres','name film'], values = 'values', title="Graphique solaire des films par genre")
fig.show()
Nous nous penchons sur la récurrence des acteurs dans les différents genres.
# Top 10 (actor, genre) pairs by number of appearances.
# df1 keeps its original name: the next cell reuses this id -> genre mapping.
df1 = credits.set_index('id')['genres'].to_dict()
actors = titles.copy()
actors['genres'] = actors['id'].map(df1)
actors = actors[actors['role'] == 'ACTOR']
top_pairs = actors.groupby(['name', 'genres']).count().sort_values('role', ascending=False)[:10]
summary = pd.DataFrame()
summary['acteur'] = [name for name, _ in top_pairs.index]
summary['genres'] = [genre for _, genre in top_pairs.index]
summary['nombres'] = list(top_pairs['id'])
fig = px.bar(summary, x='acteur', y='nombres', color = 'genres', title="TOP 10 des acteurs restés constant dans leur genre",
             color_discrete_map={"drama":"#E50913","documentation":"black","comedy":"#8d9d9a"})
fig.show()
On observe les 10 acteurs les plus constants dans les genres (Fun fact, Barack Obama est l'acteur le plus récurrent des documentaires).
On affiche ensuite les 10 acteurs ayant joué dans le plus de genres.
# TOP 10 actors by number of distinct genres played (sunburst).
# Reuses df1 (id -> genre mapping) built in the previous cell.
actors = titles.copy()
actors['genres'] = actors['id'].map(df1)
actors = actors[actors['role'] == 'ACTOR']
pairs = actors.groupby(['name', 'genres']).nunique().reset_index()
genre_count_by_actor = pairs['name'].value_counts().to_dict()
ranked = pairs[['name', 'genres']].copy()
ranked['values'] = ranked['name'].map(genre_count_by_actor)
ranked = ranked.sort_values(['values', 'name'], ascending=False)
top = ranked[:79]
fig = px.sunburst(top, path=['name','genres'], values = 'values',title = "TOP 10 des acteurs ayant joué dans le plus de genres différents")
fig.show()
Pour les prochains graphiques nous allons étudier les valeurs des séries et des films de manière séparée.
# Mean IMDB score per year, movies vs shows.
table = pd.pivot_table(credits, values='imdb_score', index='type', columns='release_year', aggfunc='mean', fill_value=0)
tableshow = table.drop('MOVIE', axis=0)
# Drop the years with no show data (fill_value left them at 0).
tableshow = tableshow.loc[:, tableshow.iloc[0] != 0]
fig = go.Figure()
fig.add_trace(go.Scatter(x=table.columns, y=table.loc['MOVIE'],
                         mode='lines', name='MOVIE', line_color="#E50913"))
fig.add_trace(go.Scatter(x=tableshow.columns, y=tableshow.loc['SHOW'],
                         mode='lines+markers', name='SHOW',
                         line_shape='spline', line_color="#0A0402"))
fig.update_layout(
    title="Evolution de la note des séries/films en fonction du temps",
    xaxis_title="Années",
    yaxis_title="Note moyenne des séries/films",
    legend_title="Legende",
)
fig.show()
La courbe d'évolution de la note Imdb montre un renversement des séries et des films depuis une quinzaine d'années.
On affiche ensuite la courbe d'évolution du temps de série/films en fonction des années
# Mean runtime per year, movies vs shows.
table = pd.pivot_table(credits,values = 'runtime' ,index = 'type', columns= 'release_year', aggfunc='mean', fill_value=0)
tableshow = table.drop('MOVIE', axis = 0)
# Drop the years with no show data (fill_value left them at 0).
for x in tableshow.columns:
    if tableshow[x][0] == 0:
        tableshow = tableshow.drop(x, axis = 1)
fig = go.Figure()
fig.add_trace(go.Scatter(x=table.columns, y=table.loc['MOVIE'],
                         mode='lines',
                         name='MOVIE',
                         line_color="#E50913"))
fig.add_trace(go.Scatter(x=tableshow.columns, y=tableshow.loc['SHOW'],
                         mode='lines+markers',
                         name='SHOW',
                         line_shape='spline',
                         line_color="#0A0402"))
fig.update_layout(
    title="Evolution du temps des séries/films en fonction du temps",  # BUG FIX: title started with "2volution"
    xaxis_title="Années",
    yaxis_title="temps moyenne des séries/films",
    legend_title="Legende",
)
fig.show()
On observe une convergence entre la durée moyenne des séries qui augmente et celle des films qui diminue.
On va s'intéresser ensuite au graphique de la note imdb par rapport à l'année et à la quantité de séries/films.
# 3-D mesh: year (x), number of titles (y), mean IMDB score (z).
df = credits.groupby(by="release_year").id.count()
df1 = pd.DataFrame()
df1["x"] = df.index
df1['y'] = df.values
# BUG FIX: the original called reset_index(drop=True) on both frames without
# assigning the result (pure no-ops, removed) and averaged every column;
# selecting the score column keeps this working when non-numeric columns
# would make groupby().mean() raise.
mean_score = credits.groupby('release_year')[['imdb_score']].mean()
df1['z'] = list(mean_score['imdb_score'])
fig = go.Figure(data=[go.Mesh3d(x=df1.x,
                                y=df1.y,
                                z=df1.z,
                                opacity=1,
                                color='#E50913'
                                )])
fig.update_layout(
    title="Evolution de la note et de la quantité de séries/films en fonction du temps",
)
fig.update_layout({
    'plot_bgcolor': 'rgba(0,0,0,1)',
    'paper_bgcolor': 'rgba(0,0,0,0)',
})
fig.show()
On observe que le graphique n'est pas très lisible, on le trace d'une autre manière.
# Same data as the mesh above, redrawn as a 3-D scatter for readability.
df1['color'] = 'red'
markers = go.Scatter3d(x=df1.x, y=df1.y, z=df1.z,
                       mode='markers',
                       line_color="#E50913")
fig = go.Figure(data=[markers])
fig.update_layout(
    title="Evolution de la note et de la quantité de séries/films en fonction du temps",
)
fig.show()
Dans un premier temps, on observe les flux des acteurs/réalisateurs entre les films et les series au cours du temps.
# Join each credit row (person) to its title's type and release year, then
# count credits per (year, type).
map_migAct_type = credits.set_index('id')['type'].to_dict()
map_migAct_year = credits.set_index('id')['release_year'].to_dict()
migAct = titles.copy()
migAct["type"] = migAct["id"].map(map_migAct_type)
migAct["year"] = migAct["id"].map(map_migAct_year)
migAct_table = migAct.pivot_table(index="year", columns="type", values="id", aggfunc="count").fillna(0)
migAct_table
| type | MOVIE | SHOW |
|---|---|---|
| year | ||
| 1954.0 | 35.0 | 0.0 |
| 1956.0 | 7.0 | 0.0 |
| 1958.0 | 29.0 | 0.0 |
| 1959.0 | 9.0 | 0.0 |
| 1960.0 | 6.0 | 0.0 |
| ... | ... | ... |
| 2018.0 | 6976.0 | 1953.0 |
| 2019.0 | 7920.0 | 1997.0 |
| 2020.0 | 7710.0 | 2192.0 |
| 2021.0 | 7260.0 | 2158.0 |
| 2022.0 | 2627.0 | 1339.0 |
62 rows × 2 columns
# Stacked bars: per-year counts of credits on movies vs shows.
bar_colors = {"MOVIE": "#E50913", "SHOW": "black"}
fig = px.bar(migAct_table, color="type", height=400,
             title="Type de formats choisis par les acteurs/réalisateurs par année",
             color_discrete_map=bar_colors)
fig.show()
A partir du graphique précédent, on remarque que la proportion d'acteurs choisissant de jouer dans des films baisse avec les années. On crée un indicateur pour signifier ce ratio de films ou de séries faits par les acteurs/réalisateurs chaque année.
# Share of credits that are on movies each year.
# Vectorized ratio instead of iterrows() with positional row[0]/row[1]
# (positional indexing on a labelled Series is deprecated in pandas and the
# column-order dependence was fragile).
migAct_table["Prop_Movie"] = migAct_table["MOVIE"] / (migAct_table["MOVIE"] + migAct_table["SHOW"])
migAct_table
| type | MOVIE | SHOW | Prop_Movie |
|---|---|---|---|
| year | |||
| 1954.0 | 35.0 | 0.0 | 1.000000 |
| 1956.0 | 7.0 | 0.0 | 1.000000 |
| 1958.0 | 29.0 | 0.0 | 1.000000 |
| 1959.0 | 9.0 | 0.0 | 1.000000 |
| 1960.0 | 6.0 | 0.0 | 1.000000 |
| ... | ... | ... | ... |
| 2018.0 | 6976.0 | 1953.0 | 0.781274 |
| 2019.0 | 7920.0 | 1997.0 | 0.798629 |
| 2020.0 | 7710.0 | 2192.0 | 0.778631 |
| 2021.0 | 7260.0 | 2158.0 | 0.770864 |
| 2022.0 | 2627.0 | 1339.0 | 0.662380 |
62 rows × 3 columns
# Attach the yearly movie proportion to every title and plot it over time.
prop_by_year = migAct_table["Prop_Movie"].to_dict()
credits["Prop_Movie"] = credits["release_year"].map(prop_by_year).fillna(0)
px.scatter(x=credits.release_year,
           y=credits.Prop_Movie,
           title="Evolution de la proportion des acteurs/réalisateurs faisant des films chaque année",
           labels={
               "x":"Year",
               "y":"Indicateur migration"},
           trendline="lowess")
On crée un indicateur qui va prendre en compte les quatre colonnes suivantes : imdb_score, imdb_votes, tmdb_popularity, tmdb_score.
Dans un premier temps on utilise le Bayesian Average decrit ici: https://www.algolia.com/doc/guides/managing-results/must-do/custom-ranking/how-to/bayesian-average/
# Bayesian average of the IMDB score (see the Algolia article linked above):
# shrink each title's score toward the global vote-weighted mean m, with
# confidence weight c taken at the 25 % rank of the vote counts.
avg = credits['imdb_votes']*credits['imdb_score']
m = avg.sum()/credits['imdb_votes'].sum()
df = credits.sort_values('imdb_votes', ascending = False)
index = credits.shape[0]*0.25
index = int(index)
# BUG FIX: df is sorted but keeps its original labels, so df.loc[index]
# fetched whichever row's *label* happened to equal the number; iloc takes the
# intended positional (percentile) row of the sorted frame.
c = df.iloc[index]['imdb_votes']
df['bayesAvg'] = (df['imdb_votes']*df['imdb_score']+c*m)/(df['imdb_votes']+c)
On additionne les 3 colonnes en prenant le log de tmdb_popularity pour avoir une distribution normale.
# Composite user-appeal indicator: mean of the Bayesian IMDB average, the TMDB
# score, and log-scaled TMDB popularity (+1 guards against log(0)).
log_popularity = np.log(df['tmdb_popularity'] + 1)
df['usr_indic'] = (df['bayesAvg'] + df['tmdb_score'] + log_popularity) / 3
fig = px.histogram(df, x="usr_indic")
fig.show()
Pour finir avec ce nouvel indicateur, nous pouvons voir l'évolution de la qualité des séries et films au cours du temps.
# Mean composite indicator per year, movies vs shows.
table = df.pivot_table(index="type", columns="release_year", values="usr_indic", aggfunc="mean").fillna(0)
tableshow = table.drop('MOVIE', axis=0)
# Drop the years with no show data (fillna left them at 0).
tableshow = tableshow.loc[:, tableshow.iloc[0] != 0]
fig = go.Figure()
fig.add_trace(go.Scatter(x=table.columns, y=table.loc['MOVIE'],
                         mode='lines', name='MOVIE', line_color="#E50913"))
fig.add_trace(go.Scatter(x=tableshow.columns, y=tableshow.loc['SHOW'],
                         mode='lines+markers', name='SHOW',
                         line_shape='spline', line_color="#0A0402"))
fig.update_layout(
    title="Evolution de la note des series/films en fonction du temps",
    xaxis_title="Années",
    yaxis_title="Note moyenne des series/films",
    legend_title="Legende",
)
fig.show()
Nous avons commencé par étudier la migration des acteurs/réalisateurs des films vers les séries et nous avons observé une tendance exponentielle de migration, ce qui est un gage de qualité des séries par rapport aux films. De plus, nous avons créé un nouvel indicateur qui a permis d'étudier l'évolution de la préférence des spectateurs entre films et séries au fil des années. Il en découle que, depuis 1994, les séries sont plus appréciées que les films.
Fun fact : 1994 est la date de sortie de la série Friends.
# --- Animated map, colored by continent this time ---
# Rebuilds the same cumulative (country, year) counts as the earlier map cell.
map_data = credits[["production_countries", "release_year", "id"]]
grouped_countries = map_data.pivot_table(index="production_countries", columns="release_year", values="id", aggfunc="count")
grouped_countries.fillna(0, inplace=True)
grouped_countries_cumulative = grouped_countries.cumsum(axis=1)
# row[0]/row[1] address production_countries / release_year by position.
values = []
prevRow = 0
for index, row in map_data.iterrows():
    values.append(grouped_countries_cumulative.loc[row[0], row[1]])
geodata = map_data.copy()
geodata["values"] = values
map_country_code_reverse = {value:key for key, value in map_country_code.items()}
geodata["iso"] = geodata["production_countries"].map(map_country_code_reverse)
country_info = pd.read_csv("countries_codes_and_coordinates.csv")
# Patch a few country names so they match this dataset's naming.
# NOTE(review): chained indexing (country_info.loc[i]["Country"] = ...) assigns
# through a temporary and may not persist — confirm the renames stick.
indexKOR = country_info[country_info["Country"]=="South Korea"]["Country"].index
country_info.loc[indexKOR.values[0]]["Country"] = "Korea"
indexCONGO = country_info[country_info["Country"]=="Congo"]["Country"].index
country_info.loc[indexCONGO.values[0]]["Country"] = "Congo, Democratic Republic"
indexIRAN = country_info[country_info["Country"]=="Iran, Islamic Republic of"]["Country"].index
country_info.loc[indexIRAN.values[0]]["Country"] = "Iran, Islamic Republic Of"
indexTanzania = country_info[country_info["Country"]=="Tanzania, United Republic of"]["Country"].index
country_info.loc[indexTanzania.values[0]]["Country"] = "Tanzania"
del country_info["Alpha-2 code"]
# Collect ISO-3 code and average coordinates for every title row.
iso3=[]
lat=[]
lon=[]
i=0
for index, row in geodata.iterrows():
    cur_info = country_info[country_info["Country"]==row[0]]
    iso3.append(cur_info["Alpha-3 code"])
    lat.append(cur_info["Latitude (average)"])
    lon.append(cur_info["Longitude (average)"])
# The reference CSV wraps values in literal double quotes; strip them.
iso3_clean = []
lat_clean = []
lon_clean = []
for x in iso3:
    iso3_clean.append(x.values[0].split("\"")[1].split("\"")[0])
for x in lat:
    lat_clean.append(x.values[0].split("\"")[1].split("\"")[0])
for x in lon:
    lon_clean.append(x.values[0].split("\"")[1].split("\"")[0])
geodata["iso3"] = iso3_clean
geodata["lat_clean"] = lat_clean
geodata["lon_clean"] = lon_clean
# Reuse the continent list computed in the treemap cell.
# NOTE(review): assumes `continent` is aligned row-for-row with geodata (both
# derive from `credits` in the same order) — confirm.
geodata["cont"] = continent
geodata["cont"] = geodata["cont"].map(continents_dic)
# Merge the two Americas into a single legend entry.
dic_rep_cont = {
    "North America":"Americas",
    "South America":"Americas"
}
geodata["cont"] = geodata["cont"].replace(dic_rep_cont)
# NOTE(review): this sort_values result is discarded (no assignment) — a no-op.
geodata.sort_values(by =["cont","production_countries"])
geodata2 = geodata[['release_year','cont','production_countries','values','lat_clean','lon_clean']]
geodata2 = geodata2.sort_values(by =["production_countries","release_year"])
geodata2 =geodata2.reset_index(drop = True)
# Animated bubble map, one frame per release year, colored by continent.
fig = px.scatter_geo(geodata2, size="values", color="cont", # which column to use to set the color of markers
                     hover_name="production_countries",
                     animation_frame="release_year", lat="lat_clean", lon="lon_clean",
                     projection="natural earth", size_max=40, title="Evolution of number of films per year")
fig.show()